smallRNA Differential Expression

Author

Lance Parsons

Load libraries

This project uses renv to keep track of installed packages. Install renv if not installed and load dependencies with renv::restore().

install.packages("renv")
renv::restore()

Read sample data

Get list of samples

Code
samples <- read_tsv("config/samples.tsv",
  col_types = list(
    sample_name = col_character(),
    cell_line = col_factor(),
    exogenous_rna = col_factor(),
    day = col_factor()
  )
)
units <- read_tsv("config/units.tsv",
  col_types = list(
    sample_name = col_character(),
    unit_name = col_character(),
    fq1 = col_character(),
    fq2 = col_character()
  )
)
sample_units <- dplyr::left_join(samples, units, by = "sample_name") %>%
  unite(sample_unit, sample_name, unit_name, remove = FALSE) %>%
  mutate(
    day = as.factor(paste0("day", day)),
    batch = as.factor(case_when(
      exogenous_rna == "mastermix1" ~ "batch3",
      exogenous_rna == "mastermix2" ~ "batch3",
      TRUE ~ "batch2"
    )),
    .before = cell_line,
  )

sample_units

Table of exogenous RNA mixtures

Code
# Exogenous RNA mixtures
rna_mixes <- tibble()
for (mix in c("mastermix1", "mastermix2", "PJY103_mDNMT1", "PJY300_mDNMT1")) {
  t <- Biostrings::readDNAStringSet(sprintf("data/references/%s.fa", mix))
  rna_mixes <- rbind(rna_mixes, tibble(
    exogenous_rna = mix,
    rna_species = word(t@ranges@NAMES, 1),
    start = 1,
    end = t@ranges@width
  ))
}

rna_mixes <- rna_mixes %>%
  mutate(
    active_cis_start_max = case_when(
      exogenous_rna == "PJY103_mDNMT1" ~ 16,
      exogenous_rna == "PJY300_mDNMT1" ~ 16,
      exogenous_rna == "mastermix1" ~ 15,
      exogenous_rna == "mastermix2" ~ 15
    ),
    active_cis_end_min = case_when(
      exogenous_rna == "PJY103_mDNMT1" ~ 74,
      exogenous_rna == "PJY300_mDNMT1" ~ 74,
      exogenous_rna == "mastermix1" ~ 73,
      exogenous_rna == "mastermix2" ~ 73
    ),
    inactive_ct_end_max = case_when(
      exogenous_rna == "PJY103_mDNMT1" ~ 38,
      exogenous_rna == "PJY300_mDNMT1" ~ 38,
      exogenous_rna == "mastermix1" ~ 37,
      exogenous_rna == "mastermix2" ~ 37
    ),
    active_trans_start_max = case_when(
      rna_species == "PJY103_mDNMT1" ~ 115,
      rna_species == "PJY300_mDNMT1" ~ 115,
      rna_species == "PJY179_FANCF" ~ 119,
      rna_species == "PJY181_HEK3" ~ 120,
      rna_species == "PJY182_HEK3" ~ 120,
      rna_species == "PJY183_DNMT1" ~ 113,
      rna_species == "PJY184_DNMT1" ~ 113,
      rna_species == "PJY186_RUNX1" ~ 117,
      rna_species == "PJY185_RUNX1" ~ 117,
      rna_species == "PJY187_VEGFA" ~ 124,
      rna_species == "PJY306_EMX1" ~ 118,
      rna_species == "PJY302_EMX1" ~ 118,
      rna_species == "PJY177_RNF2" ~ 120
    ),
    active_trans_end_min = case_when(
      rna_species == "PJY103_mDNMT1" ~ 121,
      rna_species == "PJY300_mDNMT1" ~ 121,
      rna_species == "PJY179_FANCF" ~ 124,
      rna_species == "PJY181_HEK3" ~ 121,
      rna_species == "PJY182_HEK3" ~ 121,
      rna_species == "PJY183_DNMT1" ~ 118,
      rna_species == "PJY184_DNMT1" ~ 118,
      rna_species == "PJY186_RUNX1" ~ 122,
      rna_species == "PJY185_RUNX1" ~ 122,
      rna_species == "PJY187_VEGFA" ~ 129,
      rna_species == "PJY306_EMX1" ~ 123,
      rna_species == "PJY302_EMX1" ~ 123,
      rna_species == "PJY177_RNF2" ~ 121
    ),
  )

rna_mixes

Exogenous RNA counts

BAM reading functions

Code
rna_species_plot_range <- function(sequence_name) {
  plot_range <- rna_mixes %>%
    dplyr::filter(.data$rna_species == sequence_name) %>%
    dplyr::select(start, end) %>%
    unique()
  return(plot_range)
}

# GRanges from BAM
granges_from_bam <- function(sample_unit,
                             sequence_name,
                             is_proper_pair = TRUE,
                             bam_dir =
                               "results/alignments/exogenous_rna/sorted") {
  plot_range <- rna_species_plot_range(sequence_name)
  which <- GRanges(
    sprintf("%s:%i-%i", sequence_name, plot_range$start, plot_range$end)
  )

  param <- ScanBamParam(
    flag = scanBamFlag(isProperPair = is_proper_pair),
    mapqFilter = 1,
    which = which
  )

  aligned_fragments_list <- list()

  if (is_proper_pair) {
    aligned_fragments <- granges(
      readGAlignmentPairs(
        sprintf("%s/%s.bam", bam_dir, sample_unit),
        param = param
      ),
      on.discordant.seqnames = "drop"
    )
    rna_info <- rna_mixes %>%
      dplyr::filter(rna_species == {{ sequence_name }}) %>%
      dplyr::select(-"exogenous_rna") %>%
      unique()

    # Active in cis
    aligned_fragments_list$active_cis <- aligned_fragments %>%
      plyranges::filter(
        start <= rna_info$active_cis_start_max,
        end >= rna_info$active_cis_end_min
      )

    # Active in trans
    aligned_fragments_list$active_trans <- aligned_fragments %>%
      plyranges::filter(
        start > rna_info$active_cis_start_max,
        start <= rna_info$active_trans_start_max,
        end >= rna_info$active_trans_end_min
      )

    # Inactive Cryptic Termination
    aligned_fragments_list$inactive_cryptic_terminiation <-
      aligned_fragments %>%
      plyranges::filter(end < rna_info$inactive_ct_end_max)

    # Inactive Other
    aligned_fragments_list$inactive_other <- plyranges::bind_ranges(
      aligned_fragments %>%
        plyranges::filter(
          start <= rna_info$active_cis_start_max,
          end < rna_info$active_cis_end_min,
          end > rna_info$inactive_ct_end_max
        ),
      aligned_fragments %>%
        plyranges::filter(
          start > rna_info$active_cis_start_max,
          (
            start > rna_info$active_trans_start_max |
              end < rna_info$active_trans_end_min
          )
        ),
    )
  } else {
    aligned_fragments <- granges(
      readGAlignments(
        sprintf("%s/%s.bam", bam_dir, sample_unit),
        param = param
      )
    )
    aligned_fragments_list$discordant <- aligned_fragments
  }
  return(aligned_fragments_list)
}

Read BAM coverage

Code
rna_species_plot_data <- list()
for (rna_species_to_plot in rna_mixes$rna_species) {
  rna_info <- rna_mixes %>%
    dplyr::filter(rna_species == rna_species_to_plot)

  sample_units_to_plot <- sample_units %>%
    dplyr::filter(exogenous_rna %in% rna_info$exogenous_rna)

  granges_to_plot <- list()
  for (sample_unit in sample_units_to_plot$sample_unit) {
    granges_to_plot[[sample_unit]] <- granges_from_bam(
      sample_unit,
      rna_species_to_plot,
      TRUE
    )
  }
  rna_species_plot_data[[rna_species_to_plot]] <- granges_to_plot
}

Organize coverage data

Code
exogenous_rna_count_data <- tibble(
  sample_unit = character(),
  sequence_name = character(),
  category = character(),
  count = numeric()
)
for (rna_species in names(rna_species_plot_data)) {
  for (sample_unit in names(rna_species_plot_data[[rna_species]])) {
    for (category in
      names(rna_species_plot_data[[rna_species]][[sample_unit]])) {
      exogenous_rna_count_data <- exogenous_rna_count_data %>%
        add_row(
          sample_unit = sample_unit,
          sequence_name = rna_species,
          category = category,
          count = length(
            rna_species_plot_data[[rna_species]][[sample_unit]][[category]]
          )
        )
    }
  }
}
exogenous_rna_count_data

Summarize coverage data

Code
exogenous_rna_mapped_totals <- exogenous_rna_count_data %>%
  group_by(sample_unit, sequence_name) %>%
  summarize(mapped_fragments = sum(count))
`summarise()` has grouped output by 'sample_unit'. You can override using the
`.groups` argument.
Code
exogenous_rna_mapped_totals

Human small RNA gene counts

  • count only fragments that were properly aligned
  • annotate with GENCODE gene model
  • primary alignments were counted, even if the fragments aligned multiple times
  • fragments aligning to multiple features were assigned to the feature that mostly closely overlapped with the fragment
  • exogenous RNA counts are total fragments that aligned
Code
human_counts_dir <- "results/smrna_count/"
human_gene_counts_files <- paste0(
  human_counts_dir,
  sample_units$sample_unit,
  "_first_proper_pair_gene_count.txt"
)

human_gene_counts <- readr::read_tsv(
  human_gene_counts_files[1],
  comment = "#",
  col_names = c("gene", human_gene_counts_files[1]),
  col_types = "ci"
)

for (i in 2:length(human_gene_counts_files)) {
  gene_sample <-
    readr::read_tsv(
      human_gene_counts_files[i],
      comment = "#",
      col_names = c("gene", human_gene_counts_files[i]),
      col_types = "ci"
    )
  human_gene_counts <- human_gene_counts %>%
    dplyr::full_join(gene_sample, by = "gene")
}

human_gene_counts <- human_gene_counts %>%
  rename_all(~ stringr::str_replace_all(
    ., human_counts_dir, ""
  )) %>%
  rename_all(~ str_replace_all(., "_first_proper_pair_gene_count.txt", ""))

exogenous_gene_counts_category <- exogenous_rna_count_data %>%
  tidyr::unite(gene, c(sequence_name, category), sep = ":") %>%
  tidyr::spread(sample_unit, count)

exogenous_gene_counts_total <- exogenous_rna_mapped_totals %>%
  dplyr::mutate(gene = paste(sequence_name, "total", sep = ":")) %>%
  dplyr::select(-"sequence_name") %>%
  tidyr::spread(sample_unit, mapped_fragments)

gene_counts <- rbind(
  human_gene_counts,
  exogenous_gene_counts_category,
  exogenous_gene_counts_total
)

gene_counts

Exogenous RNA “gene” names

Code
# List of exogenous genes to highlight
exogenous_rna_names <- gene_counts %>%
  dplyr::filter(str_detect(gene, "^PJY")) %>%
  pull(gene)
exogenous_rna_names
 [1] "PJY103_mDNMT1:active_cis"                   
 [2] "PJY103_mDNMT1:active_trans"                 
 [3] "PJY103_mDNMT1:inactive_cryptic_terminiation"
 [4] "PJY103_mDNMT1:inactive_other"               
 [5] "PJY177_RNF2:active_cis"                     
 [6] "PJY177_RNF2:active_trans"                   
 [7] "PJY177_RNF2:inactive_cryptic_terminiation"  
 [8] "PJY177_RNF2:inactive_other"                 
 [9] "PJY179_FANCF:active_cis"                    
[10] "PJY179_FANCF:active_trans"                  
[11] "PJY179_FANCF:inactive_cryptic_terminiation" 
[12] "PJY179_FANCF:inactive_other"                
[13] "PJY181_HEK3:active_cis"                     
[14] "PJY181_HEK3:active_trans"                   
[15] "PJY181_HEK3:inactive_cryptic_terminiation"  
[16] "PJY181_HEK3:inactive_other"                 
[17] "PJY182_HEK3:active_cis"                     
[18] "PJY182_HEK3:active_trans"                   
[19] "PJY182_HEK3:inactive_cryptic_terminiation"  
[20] "PJY182_HEK3:inactive_other"                 
[21] "PJY183_DNMT1:active_cis"                    
[22] "PJY183_DNMT1:active_trans"                  
[23] "PJY183_DNMT1:inactive_cryptic_terminiation" 
[24] "PJY183_DNMT1:inactive_other"                
[25] "PJY184_DNMT1:active_cis"                    
[26] "PJY184_DNMT1:active_trans"                  
[27] "PJY184_DNMT1:inactive_cryptic_terminiation" 
[28] "PJY184_DNMT1:inactive_other"                
[29] "PJY185_RUNX1:active_cis"                    
[30] "PJY185_RUNX1:active_trans"                  
[31] "PJY185_RUNX1:inactive_cryptic_terminiation" 
[32] "PJY185_RUNX1:inactive_other"                
[33] "PJY186_RUNX1:active_cis"                    
[34] "PJY186_RUNX1:active_trans"                  
[35] "PJY186_RUNX1:inactive_cryptic_terminiation" 
[36] "PJY186_RUNX1:inactive_other"                
[37] "PJY187_VEGFA:active_cis"                    
[38] "PJY187_VEGFA:active_trans"                  
[39] "PJY187_VEGFA:inactive_cryptic_terminiation" 
[40] "PJY187_VEGFA:inactive_other"                
[41] "PJY300_mDNMT1:active_cis"                   
[42] "PJY300_mDNMT1:active_trans"                 
[43] "PJY300_mDNMT1:inactive_cryptic_terminiation"
[44] "PJY300_mDNMT1:inactive_other"               
[45] "PJY302_EMX1:active_cis"                     
[46] "PJY302_EMX1:active_trans"                   
[47] "PJY302_EMX1:inactive_cryptic_terminiation"  
[48] "PJY302_EMX1:inactive_other"                 
[49] "PJY306_EMX1:active_cis"                     
[50] "PJY306_EMX1:active_trans"                   
[51] "PJY306_EMX1:inactive_cryptic_terminiation"  
[52] "PJY306_EMX1:inactive_other"                 
[53] "PJY103_mDNMT1:total"                        
[54] "PJY177_RNF2:total"                          
[55] "PJY179_FANCF:total"                         
[56] "PJY181_HEK3:total"                          
[57] "PJY182_HEK3:total"                          
[58] "PJY183_DNMT1:total"                         
[59] "PJY184_DNMT1:total"                         
[60] "PJY185_RUNX1:total"                         
[61] "PJY186_RUNX1:total"                         
[62] "PJY187_VEGFA:total"                         
[63] "PJY300_mDNMT1:total"                        
[64] "PJY302_EMX1:total"                          
[65] "PJY306_EMX1:total"                          

Import sample metadata and counts into DESeq2

Read these counts into DESeq2 along with the sample metadata.

Set the design to ~ exogenous_rna + day + cell_line.

Code
dds <- DESeqDataSetFromMatrix(
  countData = gene_counts %>%
    tibble::column_to_rownames("gene") %>%
    replace(is.na(.), 0),
  colData = sample_units,
  design = ~ exogenous_rna + day + cell_line
)
converting counts to integer mode
Code
dds
class: DESeqDataSet 
dim: 56752 56 
metadata(1): version
assays(1): counts
rownames(56752): GAS5 SNORD30 ... PJY302_EMX1:total PJY306_EMX1:total
rowData names(0):
colnames(56): Parental_mastermix1_day1_rep1
  Parental_mastermix1_day1_rep2 ... Parental_PJY300_epegRNA_day2_rep3
  Parental_PJY300_epegRNA_day2_rep4
colData names(9): sample_unit sample_name ... fq1 fq2

Sample comparisons

Variance Stabalized Transformation

Code
vsd <- vst(dds, blind = FALSE)

Heatmap of sample-to-sample distances

Code
sample_dists <- dist(t(assay(vsd)))

sample_dist_matrix <- as.matrix(sample_dists)
rownames(sample_dist_matrix) <- paste(vsd$cell_line,
  vsd$exogenous_rna,
  vsd$day,
  sep = "-"
)
colnames(sample_dist_matrix) <- NULL
Code
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sample_dist_matrix,
  clustering_distance_rows = sample_dists,
  clustering_distance_cols = sample_dists,
  col = colors
)

PCA plot of samples

Code
plotPCA(vsd, intgroup = c("cell_line", "exogenous_rna", "day"))

Differential Expression

Batch 2 - Day 1

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch2_day1 <- subset(dds, select = colData(dds)$batch == "batch2" &
  colData(dds)$day == "day1")
dds_batch2_day1$exogenous_rna <- droplevels(dds_batch2_day1$exogenous_rna)
dds_batch2_day1$day <- droplevels(dds_batch2_day1$day)
design(dds_batch2_day1) <- ~ exogenous_rna + cell_line
dds_batch2_day1 <- DESeq(dds_batch2_day1, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch2_day1
class: DESeqDataSet 
dim: 56752 16 
metadata(1): version
assays(4): counts mu H cooks
rownames(56752): GAS5 SNORD30 ... PJY302_EMX1:total PJY306_EMX1:total
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(16): P1E10_sorted_PJY103_pegRNA_day1_rep1
  P1E10_sorted_PJY103_pegRNA_day1_rep2 ...
  Parental_PJY300_epegRNA_day1_rep3 Parental_PJY300_epegRNA_day1_rep4
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch2_day1 <- results(dds_batch2_day1, alpha = 0.05)
res_batch2_day1
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56752 rows and 6 columns
                     baseMean log2FoldChange     lfcSE      stat      pvalue
                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
GAS5                  3358381      0.1558718 0.0336216  4.636061 3.55110e-06
SNORD30               1605167      0.0910242 0.0577513  1.576141 1.14993e-01
SNORD49A              1156164      0.1842127 0.0437292  4.212578 2.52473e-05
SNORD26                857547      0.0954226 0.0540887  1.764186 7.77006e-02
SNHG1                  780897     -0.0217632 0.0401173 -0.542488 5.87482e-01
...                       ...            ...       ...       ...         ...
PJY186_RUNX1:total          0             NA        NA        NA          NA
PJY187_VEGFA:total          0             NA        NA        NA          NA
PJY300_mDNMT1:total    192298      0.0560998 0.0433756   1.29335    0.195891
PJY302_EMX1:total           0             NA        NA        NA          NA
PJY306_EMX1:total           0             NA        NA        NA          NA
                           padj
                      <numeric>
GAS5                8.06429e-05
SNORD30             4.52981e-01
SNORD49A            4.94390e-04
SNORD26             3.62830e-01
SNHG1               8.79975e-01
...                         ...
PJY186_RUNX1:total           NA
PJY187_VEGFA:total           NA
PJY300_mDNMT1:total    0.588356
PJY302_EMX1:total            NA
PJY306_EMX1:total            NA
Code
summary(res_batch2_day1)

out of 51625 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 937, 1.8%
LFC < 0 (down)     : 1751, 3.4%
outliers [1]       : 1, 0.0019%
low counts [2]     : 23896, 46%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch2_day1_lfc <- lfcShrink(dds_batch2_day1,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch2_day1_lfc_df <- res_batch2_day1_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch2_day1_lfc_df, file = "diffexp_results_batch2_day1.tsv")

res_batch2_day1_lfc_labelled <- res_batch2_day1_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit() %>%
  arrange(gene)

ggplot(
  res_batch2_day1_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, colour = significant)
) +
  geom_point() +
  scale_colour_hue(c = 45, l = 80, na.value = "#cccccc") +
  geom_point(
    data = res_batch2_day1_lfc_labelled,
    color = "red", size = 3
  ) +
  ggrepel::geom_label_repel(
    data = res_batch2_day1_lfc_labelled,
    aes(label = gene, segment.colour = "black"),
    min.segment.length = 0,
    show.legend = FALSE
  ) +
  theme_bw()
Warning: Removed 5127 rows containing missing values (geom_point).

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch2_day1_lfc_labelled$gene) {
  gene_prefix <- str_split(gene, "_")[[1]][1]
  d <- plotCounts(dds_batch2_day1,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(gene_prefix, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list, ncol = 5)

Batch 2 - Day 2

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch2_day2 <- subset(dds, select = colData(dds)$batch == "batch2" &
  colData(dds)$day == "day2")
dds_batch2_day2$exogenous_rna <- droplevels(dds_batch2_day2$exogenous_rna)
dds_batch2_day2$day <- droplevels(dds_batch2_day2$day)
design(dds_batch2_day2) <- ~ exogenous_rna + cell_line
dds_batch2_day2 <- DESeq(dds_batch2_day2, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch2_day2
class: DESeqDataSet 
dim: 56752 16 
metadata(1): version
assays(4): counts mu H cooks
rownames(56752): GAS5 SNORD30 ... PJY302_EMX1:total PJY306_EMX1:total
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(16): P1E10_sorted_PJY103_pegRNA_day2_rep1
  P1E10_sorted_PJY103_pegRNA_day2_rep2 ...
  Parental_PJY300_epegRNA_day2_rep3 Parental_PJY300_epegRNA_day2_rep4
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch2_day2 <- results(dds_batch2_day2, alpha = 0.05)
res_batch2_day2
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56752 rows and 6 columns
                     baseMean log2FoldChange     lfcSE      stat      pvalue
                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
GAS5                  3192415     0.18567762 0.0303354  6.120823 9.30933e-10
SNORD30               1514330     0.15582664 0.0512274  3.041861 2.35120e-03
SNORD49A              1170290     0.19302856 0.0428252  4.507358 6.56398e-06
SNORD26                825436     0.09512922 0.0398426  2.387627 1.69575e-02
SNHG1                  678111     0.00437696 0.0311625  0.140456 8.88300e-01
...                       ...            ...       ...       ...         ...
PJY186_RUNX1:total          0             NA        NA        NA          NA
PJY187_VEGFA:total          0             NA        NA        NA          NA
PJY300_mDNMT1:total    129466       0.402798 0.0695254   5.79354 6.89182e-09
PJY302_EMX1:total           0             NA        NA        NA          NA
PJY306_EMX1:total           0             NA        NA        NA          NA
                           padj
                      <numeric>
GAS5                3.28203e-08
SNORD30             2.70614e-02
SNORD49A            1.37614e-04
SNORD26             1.31104e-01
SNHG1               9.76136e-01
...                         ...
PJY186_RUNX1:total           NA
PJY187_VEGFA:total           NA
PJY300_mDNMT1:total 2.20407e-07
PJY302_EMX1:total            NA
PJY306_EMX1:total            NA
Code
summary(res_batch2_day2)

out of 50458 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 880, 1.7%
LFC < 0 (down)     : 1599, 3.2%
outliers [1]       : 53, 0.11%
low counts [2]     : 25268, 50%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch2_day2_lfc <- lfcShrink(dds_batch2_day2,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch2_day2_lfc_df <- res_batch2_day2_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch2_day2_lfc_df, file = "diffexp_results_batch2_day2.tsv")

res_batch2_day2_lfc_labelled <- res_batch2_day2_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit() %>%
  arrange(gene)

ggplot(
  res_batch2_day2_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, color = significant)
) +
  geom_point() +
  scale_colour_hue(c = 45, l = 80, na.value = "#cccccc") +
  geom_point(
    data = res_batch2_day2_lfc_labelled,
    color = "red", size = 3
  ) +
  ggrepel::geom_label_repel(
    data = res_batch2_day2_lfc_labelled,
    aes(label = gene, segment.colour = "black"),
    min.segment.length = 0,
    show.legend = FALSE
  ) +
  theme_bw()
Warning: Removed 6294 rows containing missing values (geom_point).

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch2_day2_lfc_labelled$gene) {
  gene_prefix <- str_split(gene, "_")[[1]][1]
  d <- plotCounts(dds_batch2_day2,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(gene_prefix, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list, ncol = 5)

Batch 3 - Day 1

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch3_day1 <- subset(dds, select = colData(dds)$batch == "batch3" &
  colData(dds)$day == "day1")
dds_batch3_day1$exogenous_rna <- droplevels(dds_batch3_day1$exogenous_rna)
dds_batch3_day1$day <- droplevels(dds_batch3_day1$day)
design(dds_batch3_day1) <- ~ exogenous_rna + cell_line
dds_batch3_day1 <- DESeq(dds_batch3_day1, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch3_day1
class: DESeqDataSet 
dim: 56752 12 
metadata(1): version
assays(4): counts mu H cooks
rownames(56752): GAS5 SNORD30 ... PJY302_EMX1:total PJY306_EMX1:total
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(12): Parental_mastermix1_day1_rep1
  Parental_mastermix1_day1_rep2 ... P1E10_sorted_mastermix2_day1_rep2
  P1E10_sorted_mastermix2_day1_rep3
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch3_day1 <- results(dds_batch3_day1, alpha = 0.05)
res_batch3_day1
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56752 rows and 6 columns
                     baseMean log2FoldChange     lfcSE      stat      pvalue
                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
GAS5                  3961130      0.2447062 0.0340043  7.196341 6.18496e-13
SNORD30               1821966      0.2592061 0.0384898  6.734412 1.64595e-11
SNORD49A              1512043      0.2499911 0.0374016  6.683973 2.32550e-11
SNORD26                979013      0.2163973 0.0334927  6.461028 1.03994e-10
SNHG1                  822981      0.0133071 0.0372694  0.357051 7.21054e-01
...                       ...            ...       ...       ...         ...
PJY186_RUNX1:total    1633.47      -0.291022 0.0662930  -4.38994 1.13383e-05
PJY187_VEGFA:total   33192.68      -0.497089 0.0468994 -10.59904 3.01056e-26
PJY300_mDNMT1:total      0.00             NA        NA        NA          NA
PJY302_EMX1:total    12940.32      -1.371394 0.0776867 -17.65288 9.66899e-70
PJY306_EMX1:total     3907.85       0.629852 0.0636490   9.89571 4.34535e-23
                           padj
                      <numeric>
GAS5                2.25664e-11
SNORD30             5.34896e-10
SNORD49A            7.42573e-10
SNORD26             3.08768e-09
SNHG1               9.27768e-01
...                         ...
PJY186_RUNX1:total  1.83827e-04
PJY187_VEGFA:total  2.52806e-24
PJY300_mDNMT1:total          NA
PJY302_EMX1:total   2.79904e-67
PJY306_EMX1:total   3.21893e-21
Code
summary(res_batch3_day1)

out of 49790 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 866, 1.7%
LFC < 0 (down)     : 1750, 3.5%
outliers [1]       : 1, 0.002%
low counts [2]     : 27788, 56%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch3_day1_lfc <- lfcShrink(dds_batch3_day1,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch3_day1_lfc_df <- res_batch3_day1_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch3_day1_lfc_df, file = "diffexp_results_batch3_day1.tsv")

res_batch3_day1_lfc_labelled <- res_batch3_day1_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit() %>%
  arrange(gene)

ggplot(
  res_batch3_day1_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, colour = significant)
) +
  geom_point() +
  scale_colour_hue(c = 45, l = 80, na.value = "#cccccc") +
  geom_point(
    data = res_batch3_day1_lfc_labelled,
    color = "red", size = 3
  ) +
  ggrepel::geom_label_repel(
    data = res_batch3_day1_lfc_labelled,
    aes(label = gene, segment.colour = "black"),
    min.segment.length = 0,
    max.overlaps = 30,
    size = 2.5,
    show.legend = FALSE
  ) +
  theme_bw()
Warning: Removed 6962 rows containing missing values (geom_point).

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch3_day1_lfc_labelled$gene) {
  gene_rna_type <- str_split(gene, ":")[[1]][1]
  exogenous_rna <- rna_mixes %>%
    dplyr::filter(rna_species == gene_rna_type) %>%
    pull(exogenous_rna)
  exogenous_rna_regex <- paste(exogenous_rna, collapse = "|")

  d <- plotCounts(dds_batch3_day1,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(exogenous_rna_regex, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list, ncol = 5)

Batch 3 - Day 2

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch3_day2 <- subset(dds, select = colData(dds)$batch == "batch3" &
  colData(dds)$day == "day2")
dds_batch3_day2$exogenous_rna <- droplevels(dds_batch3_day2$exogenous_rna)
dds_batch3_day2$day <- droplevels(dds_batch3_day2$day)
design(dds_batch3_day2) <- ~ exogenous_rna + cell_line
dds_batch3_day2 <- DESeq(dds_batch3_day2, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch3_day2
class: DESeqDataSet 
dim: 56752 12 
metadata(1): version
assays(4): counts mu H cooks
rownames(56752): GAS5 SNORD30 ... PJY302_EMX1:total PJY306_EMX1:total
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(12): Parental_mastermix1_day2_rep1
  Parental_mastermix1_day2_rep2 ... P1E10_sorted_mastermix2_day2_rep2
  P1E10_sorted_mastermix2_day2_rep3
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch3_day2 <- results(dds_batch3_day2, alpha = 0.05)
res_batch3_day2
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56752 rows and 6 columns
                     baseMean log2FoldChange     lfcSE      stat      pvalue
                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
GAS5                  4358523      0.3666020 0.0345411 10.613505 2.57891e-26
SNORD30               2397546      0.3290985 0.0412322  7.981584 1.44467e-15
SNORD49A              1753257      0.4152087 0.0482040  8.613582 7.08122e-18
SNORD26               1116921      0.2294077 0.0409224  5.605922 2.07150e-08
SNHG1                  818059      0.0379581 0.0421218  0.901149 3.67509e-01
...                       ...            ...       ...       ...         ...
PJY186_RUNX1:total    2526.37       0.150186 0.1232145   1.21890 2.22882e-01
PJY187_VEGFA:total   40381.76      -0.913386 0.0865453 -10.55385 4.87567e-26
PJY300_mDNMT1:total      0.00             NA        NA        NA          NA
PJY302_EMX1:total    19546.57      -1.393833 0.1128495 -12.35126 4.79558e-35
PJY306_EMX1:total     5823.63       0.714436 0.0875378   8.16145 3.31029e-16
                           padj
                      <numeric>
GAS5                2.26228e-24
SNORD30             6.45430e-14
SNORD49A            3.79920e-16
SNORD26             5.01921e-07
SNHG1               7.15722e-01
...                         ...
PJY186_RUNX1:total  5.70464e-01
PJY187_VEGFA:total  4.17052e-24
PJY300_mDNMT1:total          NA
PJY302_EMX1:total   5.79229e-33
PJY306_EMX1:total   1.56935e-14
Code
summary(res_batch3_day2)

out of 50065 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 1171, 2.3%
LFC < 0 (down)     : 2142, 4.3%
outliers [1]       : 6, 0.012%
low counts [2]     : 26023, 52%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch3_day2_lfc <- lfcShrink(dds_batch3_day2,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch3_day2_lfc_df <- res_batch3_day2_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch3_day2_lfc_df, file = "diffexp_results_batch3_day2.tsv")

res_batch3_day2_lfc_labelled <- res_batch3_day2_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit() %>%
  arrange(gene)

ggplot(
  res_batch3_day2_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, color = significant)
) +
  geom_point() +
  scale_colour_hue(c = 45, l = 80, na.value = "#cccccc") +
  geom_point(
    data = res_batch3_day2_lfc_labelled,
    color = "red", size = 3
  ) +
  ggrepel::geom_label_repel(
    data = res_batch3_day2_lfc_labelled,
    mapping = aes(label = gene, segment.colour = "black"),
    min.segment.length = 0,
    max.overlaps = 30,
    size = 2.5,
    show.legend = FALSE
  ) +
  theme_bw()
Warning: Removed 6687 rows containing missing values (geom_point).
Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch3_day2_lfc_labelled$gene) {
  gene_rna_type <- str_split(gene, ":")[[1]][1]
  exogenous_rna <- rna_mixes %>%
    dplyr::filter(rna_species == gene_rna_type) %>%
    pull(exogenous_rna)
  exogenous_rna_regex <- paste(exogenous_rna, collapse = "|")

  d <- plotCounts(dds_batch3_day2,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(exogenous_rna_regex, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list, ncol = 5)